In this notebook we are going to create a data quality report piece-by-piece and then put it all together at the end.
In [ ]:
# Import the libraries we need
import pandas as pd
In [ ]:
# Import the data from the source CSV file the 'Create a Pandas DataFrame From a CSV File' recipe and take a peek
accidents_data_file = '/Users/robert.dempsey/Dropbox/Private/Art of Skill Hacking/' \
'Books/Python Business Intelligence Cookbook/Data/Stats19-Data1979-2004/Accidents7904.csv'
accidents = pd.read_csv(accidents_data_file,
sep=',',
header=0,
index_col=False,
parse_dates=['Date'],
dayfirst=True,
tupleize_cols=False,
error_bad_lines=True,
warn_bad_lines=True,
skip_blank_lines=True,
low_memory=False
)
accidents.head()
In [ ]:
# Create a DataFrame of the columns in the accidents dataframe
columns = pd.DataFrame(list(accidents.columns.values))
columns
In [ ]:
# Create a DataFrame of the data type of each column
data_types = pd.DataFrame(accidents.dtypes,
columns=['Data Type'])
data_types
In [ ]:
# Create a DataFrame with the count of missing values in each column
missing_data_counts = pd.DataFrame(accidents.isnull().sum(),
columns=['Missing Values'])
missing_data_counts
In [ ]:
# Create a DataFrame with the count of present values in each column
present_data_counts = pd.DataFrame(accidents.count(),
columns=['Present Values'])
present_data_counts
In [ ]:
# Create DataFrame with the count of unique values in each column
unique_value_counts = pd.DataFrame(columns=['Unique Values'])
for v in list(accidents.columns.values):
unique_value_counts.loc[v] = [accidents[v].nunique()]
unique_value_counts
In [ ]:
# Create a DataFrame with the minimum value in each column
minimum_values = pd.DataFrame(columns=['Minimum Value'])
for v in list(accidents.columns.values):
minimum_values.loc[v] = [accidents[v].min()]
minimum_values
In [ ]:
# Create a DataFrame with the minimum value in each column
maximum_values = pd.DataFrame(columns=['Maximum Value'])
for v in list(accidents.columns.values):
maximum_values.loc[v] = [accidents[v].max()]
maximum_values
In [ ]:
# Merge all the DataFrames together by the index
data_quality_report = data_types.join(present_data_counts).join(missing_data_counts).join(unique_value_counts)
In [ ]:
# Print out a nice report
print("\nData Quality Report")
print("Total records: {}".format(len(accidents.index)))
data_quality_report
In [ ]: